package com.ly.blog_data.utils;

import com.ly.blog_data.entity.Constant;
import lombok.extern.slf4j.Slf4j;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Html 解析操作
 * @author ly create at 2021/6/22 - 13:44
 **/
@Slf4j
public class HtmlInfoExplainUtils {

    private static Pattern pattern = Pattern.compile("charset=(\"?)(.+)\"");

    public static String getHtmlContent(String url) {
        return getHtmlContent(url,true);
    }

    /**
     * 根据 url 拿到html页面数据
     * @param url
     * @param removeLine true 去除换行符
     * @return
     */
    public static String getHtmlContent(String url,boolean removeLine) {

        log.info("【获取HTML页面数据】 --- 开始获取 -- url：{}",url);
        String charset = Constant.DEFAULT_CHARSET;

        URL urlObj = null;
        StringBuilder stringBuilder = new StringBuilder();
        InputStream in = null;
        try {

            urlObj = new URL(url);

            HttpURLConnection httpcon = (HttpURLConnection) urlObj.openConnection();
            httpcon.addRequestProperty("User-Agent", "Mozilla/4.76");
            in = httpcon.getInputStream();

//            BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(in));
//            int lineCounter = 0;
//            String temp = "";
//
//            //两种： content="text/html; charset=gbk"  charset="gbk"
//            while ((temp = bufferedReader.readLine()) != null) {
//                if (lineCounter > 0) {
//                    stringBuilder.append(Constant.NEXT_LINE);
//                }
//                // 从 html 中获取编码信息
//                if (Constant.DEFAULT_CHARSET.equals(charset)) {
//                    Matcher matcher = pattern.matcher(new String(temp).toLowerCase());
//                    if (matcher.find()) {
//                        charset = matcher.group(2);
//                    }
//                }
//                lineCounter++;
//                stringBuilder.append(new String(temp.getBytes(), Charset.forName(charset)));
//            }
//

            BufferedInputStream bufferedReader = new BufferedInputStream(in);
            int lineCounter = 0;
            byte[] temp = new byte[1024];

            //两种： content="text/html; charset=gbk"  charset="gbk"
            while ((bufferedReader.read(temp)) != -1) {
                if (lineCounter > 0) {
                    stringBuilder.append(Constant.NEXT_LINE);
                }
                // 从 html 中获取编码信息
                if (Constant.DEFAULT_CHARSET.equals(charset)) {
                    Matcher matcher = pattern.matcher(new String(temp).toLowerCase());
                    if (matcher.find()) {
                        charset = matcher.group(2);
                    }
                }
                lineCounter++;
                stringBuilder.append(new String(temp, Charset.forName(charset)));
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if(in != null){
                try {
                    in.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        String c = stringBuilder.toString();
        if(removeLine){
            c = c.replace("\r\n","").replace("\n","");
        }

        if(c == null || "null".equals(c)){
            log.info("【获取HTML页面数据】 --- 获取数据为空，修整五秒后，再次虎丘 -- url：{}",url);

            try {
                Thread.sleep(5000);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }

            return getHtmlContent(url);

        }
        log.info("【获取HTML页面数据】 --- 获取成功 -- url：{}",url);
        return c;

    }


}
